EDA

An overview of the basic lay of the land of this data shows that:

employees <- read.csv("CaseStudy2-data.csv")

str(employees)
## 'data.frame':    870 obs. of  36 variables:
##  $ ID                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : int  32 40 35 32 24 27 41 37 34 34 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
##  $ DailyRate               : int  117 1308 200 801 567 294 1283 309 1333 653 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
##  $ DistanceFromHome        : int  13 14 18 1 2 10 5 10 10 10 ...
##  $ Education               : int  4 3 2 4 1 2 5 4 4 4 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
##  $ EnvironmentSatisfaction : int  2 3 3 3 1 4 2 4 3 4 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
##  $ HourlyRate              : int  73 44 60 48 32 32 90 88 87 92 ...
##  $ JobInvolvement          : int  3 2 3 3 3 3 4 2 3 2 ...
##  $ JobLevel                : int  2 5 3 3 1 3 1 2 1 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
##  $ JobSatisfaction         : int  4 3 4 4 4 1 3 4 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
##  $ MonthlyIncome           : int  4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
##  $ MonthlyRate             : int  9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
##  $ NumCompaniesWorked      : int  2 1 2 1 1 1 2 2 1 1 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
##  $ PercentSalaryHike       : int  11 14 11 19 13 21 12 14 19 14 ...
##  $ PerformanceRating       : int  3 3 3 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 3 3 3 1 3 4 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 2 0 2 0 3 1 1 ...
##  $ TotalWorkingYears       : int  8 21 10 14 6 9 7 8 1 8 ...
##  $ TrainingTimesLastYear   : int  3 2 2 3 2 4 5 5 2 3 ...
##  $ WorkLifeBalance         : int  2 4 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  5 20 2 14 6 9 4 1 1 8 ...
##  $ YearsInCurrentRole      : int  2 7 2 10 3 7 2 0 1 2 ...
##  $ YearsSinceLastPromotion : int  0 4 2 5 1 1 0 0 0 7 ...
##  $ YearsWithCurrManager    : int  3 9 2 7 3 7 3 0 0 7 ...
colSums(is.na(employees))
##                       ID                      Age                Attrition 
##                        0                        0                        0 
##           BusinessTravel                DailyRate               Department 
##                        0                        0                        0 
##         DistanceFromHome                Education           EducationField 
##                        0                        0                        0 
##            EmployeeCount           EmployeeNumber  EnvironmentSatisfaction 
##                        0                        0                        0 
##                   Gender               HourlyRate           JobInvolvement 
##                        0                        0                        0 
##                 JobLevel                  JobRole          JobSatisfaction 
##                        0                        0                        0 
##            MaritalStatus            MonthlyIncome              MonthlyRate 
##                        0                        0                        0 
##       NumCompaniesWorked                   Over18                 OverTime 
##                        0                        0                        0 
##        PercentSalaryHike        PerformanceRating RelationshipSatisfaction 
##                        0                        0                        0 
##            StandardHours         StockOptionLevel        TotalWorkingYears 
##                        0                        0                        0 
##    TrainingTimesLastYear          WorkLifeBalance           YearsAtCompany 
##                        0                        0                        0 
##       YearsInCurrentRole  YearsSinceLastPromotion     YearsWithCurrManager 
##                        0                        0                        0
summary(employees$MonthlyIncome)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1081    2840    4946    6390    8182   19999
summary(employees$Attrition)
##  No Yes 
## 730 140
#useless variable with one value across the whole thing
employees$Over18 <- NULL

Are any ordinal variables going in as continuous right now? yes

employees$JobInvolvement[which(employees$JobInvolvement == 1)] = 'Low'
employees$JobInvolvement[which(employees$JobInvolvement == 2)] = 'Medium'
employees$JobInvolvement[which(employees$JobInvolvement == 3)] = 'High'
employees$JobInvolvement[which(employees$JobInvolvement == 4)] = 'Very High'
employees$JobInvolvement = as.factor(employees$JobInvolvement)
summary(employees$JobInvolvement)
##      High       Low    Medium Very High 
##       514        47       228        81
employees$JobSatisfaction[which(employees$JobSatisfaction == 1)] = 'Low'
employees$JobSatisfaction[which(employees$JobSatisfaction == 2)] = 'Medium'
employees$JobSatisfaction[which(employees$JobSatisfaction == 3)] = 'High'
employees$JobSatisfaction[which(employees$JobSatisfaction == 4)] = 'Very High'
employees$JobSatisfaction = as.factor(employees$JobSatisfaction)
summary(employees$JobSatisfaction)
##      High       Low    Medium Very High 
##       254       179       166       271
employees$PerformanceRating[which(employees$PerformanceRating == 1)] = 'Low'
employees$PerformanceRating[which(employees$PerformanceRating == 2)] = 'Good'
employees$PerformanceRating[which(employees$PerformanceRating == 3)] = 'Excellent'
employees$PerformanceRating[which(employees$PerformanceRating == 4)] = 'Outstanding'
employees$PerformanceRating = as.factor(employees$PerformanceRating)
summary(employees$PerformanceRating)
##   Excellent Outstanding 
##         738         132
employees$RelationshipSatisfaction[which(employees$RelationshipSatisfaction == 1)] = 'Low'
employees$RelationshipSatisfaction[which(employees$RelationshipSatisfaction == 2)] = 'Medium'
employees$RelationshipSatisfaction[which(employees$RelationshipSatisfaction == 3)] = 'High'
employees$RelationshipSatisfaction[which(employees$RelationshipSatisfaction == 4)] = 'Very High'
employees$RelationshipSatisfaction = as.factor(employees$RelationshipSatisfaction)
summary(employees$RelationshipSatisfaction)
##      High       Low    Medium Very High 
##       261       174       171       264
employees$WorkLifeBalance[which(employees$WorkLifeBalance == 1)] = 'Bad'
employees$WorkLifeBalance[which(employees$WorkLifeBalance == 2)] = 'Good'
employees$WorkLifeBalance[which(employees$WorkLifeBalance == 3)] = 'Better'
employees$WorkLifeBalance[which(employees$WorkLifeBalance == 4)] = 'Best'
employees$WorkLifeBalance = as.factor(employees$WorkLifeBalance)
summary(employees$WorkLifeBalance)
##    Bad   Best Better   Good 
##     48     98    532    192
employees$Education[which(employees$Education == 1)] = 'Below College'
employees$Education[which(employees$Education == 2)] = 'College'
employees$Education[which(employees$Education == 3)] = 'Bachelor'
employees$Education[which(employees$Education == 4)] = 'Master'
employees$Education[which(employees$Education == 5)] = 'Doctor'
employees$Education = as.factor(employees$Education)
summary(employees$WorkLifeBalance)
##    Bad   Best Better   Good 
##     48     98    532    192
employees$EnvironmentSatisfaction[which(employees$EnvironmentSatisfaction == 1)] = 'Low'
employees$EnvironmentSatisfaction[which(employees$EnvironmentSatisfaction == 2)] = 'Medium'
employees$EnvironmentSatisfaction[which(employees$EnvironmentSatisfaction == 3)] = 'High'
employees$EnvironmentSatisfaction[which(employees$EnvironmentSatisfaction == 4)] = 'Very High'
employees$EnvironmentSatisfaction = as.factor(employees$EnvironmentSatisfaction)
summary(employees$EnvironmentSatisfaction)
##      High       Low    Medium Very High 
##       258       172       178       262
employees$StockOptionLevel = factor(employees$StockOptionLevel)
summary(employees$StockOptionLevel)
##   0   1   2   3 
## 379 355  81  55

Highest job satisfaction vs job role

library(ggmosaic)

ggplot(data = employees) +
   geom_mosaic(aes(x = product(JobSatisfaction, JobRole), fill=JobRole), na.rm=TRUE) + labs(x = "Job Role", title='Job Satisfaction in Job Roles', y='Job Satisfaction')+
theme(axis.text.x = element_text(angle = 90))

#Chi square says no significant difference in job satisfaction across education fields
table(employees$JobSatisfaction, employees$JobRole)
##            
##             Healthcare Representative Human Resources Laboratory Technician
##   High                             23               8                    43
##   Low                              16               5                    32
##   Medium                            9               8                    31
##   Very High                        28               6                    47
##            
##             Manager Manufacturing Director Research Director Research Scientist
##   High           12                     29                16                 48
##   Low            12                     12                13                 32
##   Medium         14                     23                11                 31
##   Very High      13                     23                11                 61
##            
##             Sales Executive Sales Representative
##   High                   61                   14
##   Low                    48                    9
##   Medium                 25                   14
##   Very High              66                   16
chisq.test(table(employees$JobSatisfaction, employees$JobRole))
## 
##  Pearson's Chi-squared test
## 
## data:  table(employees$JobSatisfaction, employees$JobRole)
## X-squared = 26.048, df = 24, p-value = 0.3507

job role vs education field: Those who got Marketing or Human Resources degrees were all either in HR, management, or sales; all the research-related jobs had science, medical, technical, or other degree.

ggplot(data = employees) +
   geom_mosaic(aes(x = product(EducationField, JobRole), fill=EducationField), na.rm=TRUE) + labs(y = "Education Field", title='Education Field of Job Roles', x='Job Role') +
theme(axis.text.x = element_text(angle = 90))

income job role: HR, lab technician, research scientist, and sales representatives make the least money. Managers and Research directors make the most money. Healthcare representatives, Manufacturing directors, and sales executives make in the middle.

ggplot(data = employees, aes(x=JobRole, y=MonthlyIncome, fill=JobRole)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("Incomes of Job Roles")

Working years vs job roles: Sales represntative and HR seem to be the most entry-level type job, whereas manager and director have generally been working a longer time than those in other roles.

ggplot(data = employees, aes(x=JobRole, y=TotalWorkingYears, fill=JobRole)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90)) +
  ggtitle("How Long People of Job Roles Have Worked")

JobInvolvement of job roles: All the job roles have the same level of job involvement, at least according to this data.

ggplot(data = employees) +
   geom_mosaic(aes(x = product(factor(JobInvolvement), JobRole), fill=factor(JobInvolvement)), na.rm=TRUE) + labs(y = "Job Involvement", title='Involvment of Job Roles', x='Job Role') +
theme(axis.text.x = element_text(angle = 90))

What continuous variables have a difference in distribution of attrition? pairs plots continuous

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:ggmosaic':
## 
##     happy
## The following object is masked from 'package:dplyr':
## 
##     nasa
#there's 22 continuous variables
employees %>%
  select_if(is.numeric) %>%
  dim()
## [1] 870  19
employees %>%
  select_if(is.numeric) %>%
  select(1:5) %>%
  mutate(Attrition = employees$Attrition) %>%
  sample_n(200) %>%
  ggpairs(aes(colour = Attrition)) + 
  ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

employees %>%
  select_if(is.numeric) %>%
  select(6:10) %>%
  mutate(Attrition = employees$Attrition) %>%
  sample_n(200) %>%
  ggpairs(aes(colour = Attrition)) + 
  ggtitle("Pairs Plot")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

employees %>%
  select_if(is.numeric) %>%
  select(11:15) %>%
  mutate(Attrition = employees$Attrition) %>%
  sample_n(200) %>%
  ggpairs(aes(colour = Attrition)) + 
  ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

employees %>%
  select_if(is.numeric) %>%
  select(16:19) %>%
  mutate(Attrition = employees$Attrition) %>%
  sample_n(200) %>%
  ggpairs(aes(colour = Attrition)) + 
  ggtitle("Pairs Plot")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Check for multicollinearity between continuous variables

library(corrplot)
## corrplot 0.84 loaded
## corrplot 0.84 loaded
M <- employees %>%
  select_if(is.numeric) %>%
  cor()
## Warning in cor(.): the standard deviation is zero
corrplot(M, method = "circle")

What categorical variables have a difference in distribution in response?

#there's 14 categorical variables
employees %>%
  select_if(is.factor) %>%
  dim()
## [1] 870  16
categs <- names(select_if(employees, is.factor))

for(i in 1:length(categs)){
  print(employees %>%
    ggplot(aes(x = eval(parse(text=categs[i])), fill = Attrition)) +
    geom_bar(position = "fill") +
    xlab(categs[i])
  )
}

setting up balanced train test split

split_train_test <- function(df) {
  # dataset with "no"
  data_no = df[which(df$Attrition=="No"),]
  # dataset with "yes"
  data_yes = df[which(df$Attrition=="Yes"),]
  
  #making more folds on No to balance the number with Yes 
  folds_no = createFolds(data_no$Attrition, k=8)
  folds_yes = createFolds(data_yes$Attrition, k=2)
  length(folds_no$Fold1)
  length(folds_no$Fold2)
  length(folds_yes$Fold1)
  length(folds_yes$Fold2)
  
  #Train
  train_no = data_no[folds_no$Fold1,]
  train_yes = data_yes[folds_yes$Fold1,]
  train = rbind(train_no, train_yes)
  
  #Test
  test_no = data_no[c(folds_no$Fold2, folds_no$Fold3, folds_no$Fold4, folds_no$Fold5),]
  test_yes = data_yes[folds_yes$Fold2,]
  test = rbind(test_no, test_yes)
  
  return(list(train, test))
}

Predict attrition - Naive Bayes

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(plotROC)

#naive bayes
#balanced training set
train1 <- split_train_test(employees)[[1]]
test1 <- split_train_test(employees)[[2]]
model.nb1 <- naiveBayes(Attrition ~ ., data=train1)
preds.nb1 <- predict(model.nb1, test1)
confusionMatrix(table(preds.nb1, test1$Attrition))
## Confusion Matrix and Statistics
## 
##          
## preds.nb1  No Yes
##       No  256  18
##       Yes 108  52
##                                          
##                Accuracy : 0.7097         
##                  95% CI : (0.6645, 0.752)
##     No Information Rate : 0.8387         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2937         
##                                          
##  Mcnemar's Test P-Value : 2.214e-15      
##                                          
##             Sensitivity : 0.7033         
##             Specificity : 0.7429         
##          Pos Pred Value : 0.9343         
##          Neg Pred Value : 0.3250         
##              Prevalence : 0.8387         
##          Detection Rate : 0.5899         
##    Detection Prevalence : 0.6313         
##       Balanced Accuracy : 0.7231         
##                                          
##        'Positive' Class : No             
## 
preds.nb1 <- predict(model.nb1, test1, type = "raw")
preds.nb1 <- prediction(preds.nb1[,2], test1$Attrition)
roc.perf_1 = performance(preds.nb1, measure = "tpr", x.measure = "fpr")

#unbalanced training set
folds <- createFolds(employees$Attrition, k=2)
train2 <- employees[folds$Fold1,]
test2 <- employees[folds$Fold2,]
model.nb2 <- naiveBayes(Attrition ~ ., data=train2)
preds.nb2 <- predict(model.nb2, test2)
confusionMatrix(table(preds.nb2, test2$Attrition))
## Confusion Matrix and Statistics
## 
##          
## preds.nb2  No Yes
##       No  299  32
##       Yes  66  38
##                                           
##                Accuracy : 0.7747          
##                  95% CI : (0.7325, 0.8131)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 0.9998141       
##                                           
##                   Kappa : 0.3026          
##                                           
##  Mcnemar's Test P-Value : 0.0008576       
##                                           
##             Sensitivity : 0.8192          
##             Specificity : 0.5429          
##          Pos Pred Value : 0.9033          
##          Neg Pred Value : 0.3654          
##              Prevalence : 0.8391          
##          Detection Rate : 0.6874          
##    Detection Prevalence : 0.7609          
##       Balanced Accuracy : 0.6810          
##                                           
##        'Positive' Class : No              
## 
preds.nb2 <- predict(model.nb2, test2, type = "raw")
preds.nb2 <- prediction(preds.nb2[,2], test2$Attrition)
roc.perf_2 = performance(preds.nb2, measure = "tpr", x.measure = "fpr")

#the balanced one definitely looks better and .25 seems a good threshold
plot(roc.perf_1, col="red")
plot(roc.perf_2, add = TRUE, col="blue")

make numeric and make standardize scale data for KNN

#re-read in data
employees_num <- read.csv("CaseStudy2-data.csv")

#get rid of useless variables
employees_num$Over18 <- NULL
employees_num$ID <- NULL

#numericize categorical variables
employees_num$BusinessTravel = as.numeric(employees_num$BusinessTravel)
employees_num$Department = as.numeric(employees_num$Department)
employees_num$EducationField = as.numeric(employees_num$EducationField)
employees_num$Gender = as.numeric(employees_num$Gender)
employees_num$JobRole = as.numeric(employees_num$JobRole)
employees_num$MaritalStatus = as.numeric(employees_num$MaritalStatus)
employees_num$OverTime = as.numeric(employees_num$OverTime)

n <- dim(employees_num)[2]

#scale
employees_z <- employees_num %>%
mutate(zAge = scale(Age))  %>% 
mutate(zBusinessTravel = scale(BusinessTravel)) %>%
mutate(zDailyRate = scale(DailyRate)) %>%
mutate(zDepartment = scale(Department)) %>%
mutate(zDistanceFromHome = scale(DistanceFromHome)) %>%
mutate(zEducation = scale(Education)) %>%
mutate(zEducationField = scale(EducationField)) %>%
mutate(zEmployeeCount = scale(EmployeeCount)) %>%
mutate(zEmployeeNumber = scale(EmployeeNumber)) %>%
mutate(zEnvironmentSatisfaction = scale(EnvironmentSatisfaction)) %>%
mutate(zGender = scale(Gender)) %>%
mutate(zHourlyRate = scale(HourlyRate))           %>%
mutate(zJobInvolvement = scale(JobInvolvement)) %>%
mutate(zJobLevel = scale(JobLevel)) %>%
mutate(zJobRole = scale(JobRole)) %>%
mutate(zJobSatisfaction = scale(JobSatisfaction)) %>%
mutate(zMaritalStatus = scale(MaritalStatus)) %>%
mutate(zMonthlyIncome = scale(MonthlyIncome)) %>%
mutate(zMonthlyRate = scale(MonthlyRate)) %>%
mutate(zNumCompaniesWorked = scale(NumCompaniesWorked)) %>%
mutate(zOverTime = scale(OverTime)) %>%
mutate(zPercentSalaryHike = scale(PercentSalaryHike)) %>%
mutate(zPerformanceRating = scale(PerformanceRating)) %>%
mutate(zRelationshipSatisfaction = scale(RelationshipSatisfaction)) %>%
mutate(zStandardHours = scale(StandardHours)) %>%
mutate(zStockOptionLevel = scale(StockOptionLevel)) %>%
mutate(zTotalWorkingYears = scale(TotalWorkingYears)) %>%
mutate(zTrainingTimesLastYear = scale(TrainingTimesLastYear)) %>%
mutate(zWorkLifeBalance = scale(WorkLifeBalance)) %>%
mutate(zYearsAtCompany = scale(YearsAtCompany)) %>%
mutate(zYearsInCurrentRole = scale(YearsInCurrentRole)) %>%
mutate(zYearsSinceLastPromotion = scale(YearsSinceLastPromotion)) %>%
mutate(zYearsWithCurrManager = scale(YearsWithCurrManager))

nz <- dim(employees_z)[2]
employees_z <- employees_z[,c((n+1):nz)]
employees_z$Attrition = employees$Attrition
#get rid of nans
colSums(is.na(employees_z))
##                                                                                 
##         0         0         0         0         0         0         0       870 
##                                                                                 
##         0         0         0         0         0         0         0         0 
##                                                                                 
##         0         0         0         0         0         0         0         0 
##                                                                                 
##       870         0         0         0         0         0         0         0 
##           Attrition 
##         0         0
employees_z$zStandardHours <- NULL
employees_z$zEmployeeCount <- NULL
str(employees_z)
## 'data.frame':    870 obs. of  32 variables:
##  $ zAge                     : num [1:870, 1] -0.541 0.355 -0.205 -0.541 -1.437 ...
##   ..- attr(*, "scaled:center")= num 36.8
##   ..- attr(*, "scaled:scale")= num 8.93
##  $ zBusinessTravel          : num [1:870, 1] 0.589 0.589 -0.892 0.589 -0.892 ...
##   ..- attr(*, "scaled:center")= num 2.6
##   ..- attr(*, "scaled:scale")= num 0.675
##  $ zDailyRate               : num [1:870, 1] -1.7407 1.2285 -1.5338 -0.0355 -0.6188 ...
##   ..- attr(*, "scaled:center")= num 815
##   ..- attr(*, "scaled:scale")= num 401
##  $ zDepartment              : num [1:870, 1] 1.374 -0.517 -0.517 1.374 -0.517 ...
##   ..- attr(*, "scaled:center")= num 2.27
##   ..- attr(*, "scaled:scale")= num 0.529
##  $ zDistanceFromHome        : num [1:870, 1] 0.45 0.573 1.064 -1.025 -0.902 ...
##   ..- attr(*, "scaled:center")= num 9.34
##   ..- attr(*, "scaled:scale")= num 8.14
##  $ zEducation               : num [1:870, 1] 1.0741 0.0966 -0.8809 1.0741 -1.8583 ...
##   ..- attr(*, "scaled:center")= num 2.9
##   ..- attr(*, "scaled:scale")= num 1.02
##  $ zEducationField          : num [1:870, 1] -0.94 0.573 -0.94 -0.183 2.085 ...
##   ..- attr(*, "scaled:center")= num 3.24
##   ..- attr(*, "scaled:scale")= num 1.32
##  $ zEmployeeNumber          : num [1:870, 1] -0.282 0.162 0.632 1.631 1.019 ...
##   ..- attr(*, "scaled:center")= num 1030
##   ..- attr(*, "scaled:scale")= num 605
##  $ zEnvironmentSatisfaction : num [1:870, 1] -0.638 0.272 0.272 0.272 -1.547 ...
##   ..- attr(*, "scaled:center")= num 2.7
##   ..- attr(*, "scaled:scale")= num 1.1
##  $ zGender                  : num [1:870, 1] 0.828 0.828 0.828 -1.207 -1.207 ...
##   ..- attr(*, "scaled:center")= num 1.59
##   ..- attr(*, "scaled:scale")= num 0.492
##  $ zHourlyRate              : num [1:870, 1] 0.367 -1.074 -0.279 -0.875 -1.67 ...
##   ..- attr(*, "scaled:center")= num 65.6
##   ..- attr(*, "scaled:scale")= num 20.1
##  $ zJobInvolvement          : num [1:870, 1] 0.394 -1.028 0.394 0.394 0.394 ...
##   ..- attr(*, "scaled:center")= num 2.72
##   ..- attr(*, "scaled:scale")= num 0.704
##  $ zJobLevel                : num [1:870, 1] -0.0358 2.7162 0.8815 0.8815 -0.9532 ...
##   ..- attr(*, "scaled:center")= num 2.04
##   ..- attr(*, "scaled:scale")= num 1.09
##  $ zJobRole                 : num [1:870, 1] 1.002 0.189 -0.217 1.002 0.596 ...
##   ..- attr(*, "scaled:center")= num 5.53
##   ..- attr(*, "scaled:scale")= num 2.46
##  $ zJobSatisfaction         : num [1:870, 1] 1.158 0.261 1.158 1.158 1.158 ...
##   ..- attr(*, "scaled:center")= num 2.71
##   ..- attr(*, "scaled:scale")= num 1.11
##  $ zMaritalStatus           : num [1:870, 1] -1.509 1.261 1.261 -0.124 1.261 ...
##   ..- attr(*, "scaled:center")= num 2.09
##   ..- attr(*, "scaled:scale")= num 0.722
##  $ zMonthlyIncome           : num [1:870, 1] -0.432 2.879 0.646 0.877 -0.572 ...
##   ..- attr(*, "scaled:center")= num 6390
##   ..- attr(*, "scaled:scale")= num 4598
##  $ zMonthlyRate             : num [1:870, 1] -0.714 0.453 0.79 1.365 0.407 ...
##   ..- attr(*, "scaled:center")= num 14326
##   ..- attr(*, "scaled:scale")= num 7108
##  $ zNumCompaniesWorked      : num [1:870, 1] -0.289 -0.685 -0.289 -0.685 -0.685 ...
##   ..- attr(*, "scaled:center")= num 2.73
##   ..- attr(*, "scaled:scale")= num 2.52
##  $ zOverTime                : num [1:870, 1] -0.638 -0.638 -0.638 -0.638 1.565 ...
##   ..- attr(*, "scaled:center")= num 1.29
##   ..- attr(*, "scaled:scale")= num 0.454
##  $ zPercentSalaryHike       : num [1:870, 1] -1.143 -0.326 -1.143 1.034 -0.599 ...
##   ..- attr(*, "scaled:center")= num 15.2
##   ..- attr(*, "scaled:scale")= num 3.68
##  $ zPerformanceRating       : num [1:870, 1] -0.423 -0.423 -0.423 -0.423 -0.423 ...
##   ..- attr(*, "scaled:center")= num 3.15
##   ..- attr(*, "scaled:scale")= num 0.359
##  $ zRelationshipSatisfaction: num [1:870, 1] 0.266 -1.548 0.266 0.266 0.266 ...
##   ..- attr(*, "scaled:center")= num 2.71
##   ..- attr(*, "scaled:scale")= num 1.1
##  $ zStockOptionLevel        : num [1:870, 1] 0.252 -0.914 -0.914 1.418 -0.914 ...
##   ..- attr(*, "scaled:center")= num 0.784
##   ..- attr(*, "scaled:scale")= num 0.858
##  $ zTotalWorkingYears       : num [1:870, 1] -0.406 1.324 -0.14 0.392 -0.672 ...
##   ..- attr(*, "scaled:center")= num 11.1
##   ..- attr(*, "scaled:scale")= num 7.51
##  $ zTrainingTimesLastYear   : num [1:870, 1] 0.132 -0.654 -0.654 0.132 -0.654 ...
##   ..- attr(*, "scaled:center")= num 2.83
##   ..- attr(*, "scaled:scale")= num 1.27
##  $ zWorkLifeBalance         : num [1:870, 1] -1.098 1.711 0.307 0.307 0.307 ...
##   ..- attr(*, "scaled:center")= num 2.78
##   ..- attr(*, "scaled:scale")= num 0.712
##  $ zYearsAtCompany          : num [1:870, 1] -0.326 2.165 -0.824 1.169 -0.16 ...
##   ..- attr(*, "scaled:center")= num 6.96
##   ..- attr(*, "scaled:scale")= num 6.02
##  $ zYearsInCurrentRole      : num [1:870, 1] -0.606 0.768 -0.606 1.592 -0.331 ...
##   ..- attr(*, "scaled:center")= num 4.2
##   ..- attr(*, "scaled:scale")= num 3.64
##  $ zYearsSinceLastPromotion : num [1:870, 1] -0.681 0.575 -0.053 0.889 -0.367 ...
##   ..- attr(*, "scaled:center")= num 2.17
##   ..- attr(*, "scaled:scale")= num 3.19
##  $ zYearsWithCurrManager    : num [1:870, 1] -0.319 1.36 -0.599 0.8 -0.319 ...
##   ..- attr(*, "scaled:center")= num 4.14
##   ..- attr(*, "scaled:scale")= num 3.57
##  $ Attrition                : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...

Predict attrition - KNN

#KNN
##load the package class
 library(class)

train <- split_train_test(employees_num)[[1]]
test <- split_train_test(employees_num)[[2]]

##run knn k=8 and make ROC--it looks really bad
preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=train$Attrition, k=8, prob=TRUE)
prob.knn <- attr(preds.knn, "prob")
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn = performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")
auc <- auc@y.values
plot(roc.perf_knn, colorize = TRUE)

auc.knns <- c()
for(i in 1:80) {
#train test split
train <- split_train_test(employees_num)[[1]]
test <- split_train_test(employees_num)[[2]]

##get auc of knn
preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=train$Attrition, k=i, prob=TRUE)
prob.knn <- attr(preds.knn, "prob")
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn = performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")
auc <- auc@y.values
auc.knns <- c(auc.knns, auc)
}
plot(x=1:80, y=auc.knns, xlab="k", main = "AUCs of KNN models with k=[1,80]")

#all the KNN models are bad

#second time, this time with scaled variables

train <- split_train_test(employees_z)[[1]]
test <- split_train_test(employees_z)[[2]]

str(train)
## 'data.frame':    161 obs. of  32 variables:
##  $ zAge                     : num [1:161, 1] -0.541 0.355 1.027 -0.317 0.131 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zBusinessTravel          : num [1:161, 1] 0.589 0.589 0.589 0.589 0.589 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zDailyRate               : num [1:161, 1] -1.741 1.229 0.438 -1.581 1.071 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zDepartment              : num [1:161, 1] 1.374 -0.517 -2.409 -0.517 1.374 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zDistanceFromHome        : num [1:161, 1] 0.45 0.573 -1.025 -0.902 0.573 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zEducation               : num [1:161, 1] 1.0741 0.0966 -0.8809 1.0741 0.0966 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zEducationField          : num [1:161, 1] -0.94 0.573 -0.94 0.573 -0.94 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zEmployeeNumber          : num [1:161, 1] -0.282 0.162 0.47 1.199 0.913 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zEnvironmentSatisfaction : num [1:161, 1] -0.638 0.272 1.181 1.181 0.272 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zGender                  : num [1:161, 1] 0.828 0.828 -1.207 0.828 0.828 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zHourlyRate              : num [1:161, 1] 0.367 -1.074 -1.074 1.559 0.715 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zJobInvolvement          : num [1:161, 1] 0.394 -1.028 0.394 1.815 0.394 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zJobLevel                : num [1:161, 1] -0.0358 2.7162 -0.9532 -0.9532 -0.0358 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zJobRole                 : num [1:161, 1] 1.002 0.189 -1.437 0.596 1.002 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zJobSatisfaction         : num [1:161, 1] 1.158 0.261 -1.534 1.158 -0.636 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zMaritalStatus           : num [1:161, 1] -1.509 1.261 1.261 -0.124 -0.124 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zMonthlyIncome           : num [1:161, 1] -0.432 2.879 -0.645 -0.752 0.769 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zMonthlyRate             : num [1:161, 1] -0.714 0.453 1.214 -1.229 -0.277 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zNumCompaniesWorked      : num [1:161, 1] -0.289 -0.685 1.298 -1.082 -1.082 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zOverTime                : num [1:161, 1] -0.638 -0.638 -0.638 1.565 -0.638 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zPercentSalaryHike       : num [1:161, 1] -1.143 -0.326 -0.871 -0.326 -1.143 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zPerformanceRating       : num [1:161, 1] -0.423 -0.423 -0.423 -0.423 -0.423 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zRelationshipSatisfaction: num [1:161, 1] 0.266 -1.548 0.266 -1.548 1.173 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zStockOptionLevel        : num [1:161, 1] 0.252 -0.914 -0.914 2.584 0.252 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zTotalWorkingYears       : num [1:161, 1] -0.406 1.324 -0.14 -0.672 -0.14 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zTrainingTimesLastYear   : num [1:161, 1] 0.132 -0.654 0.132 0.132 0.132 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zWorkLifeBalance         : num [1:161, 1] -1.098 1.711 1.711 0.307 0.307 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zYearsAtCompany          : num [1:161, 1] -0.3259 2.1654 0.0063 -0.3259 0.3385 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zYearsInCurrentRole      : num [1:161, 1] -0.606 0.768 0.493 -1.155 1.043 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zYearsSinceLastPromotion : num [1:161, 1] -0.681 0.575 0.889 -0.367 1.516 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zYearsWithCurrManager    : num [1:161, 1] -0.319 1.36 0.8 -0.599 0.8 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ Attrition                : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
str(test)
## 'data.frame':    434 obs. of  32 variables:
##  $ zAge                     : num [1:434, 1] -0.541 -0.541 -0.0928 1.6997 2.1478 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zBusinessTravel          : num [1:434, 1] 0.589 0.589 0.589 -2.372 0.589 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zDailyRate               : num [1:434, 1] -1.7407 -0.0355 0.2438 -0.2499 -1.5188 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zDepartment              : num [1:434, 1] 1.374 1.374 -0.517 -0.517 -2.409 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zDistanceFromHome        : num [1:434, 1] 0.4499 -1.0249 -0.0417 1.1873 -0.1646 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zEducation               : num [1:434, 1] 1.074 1.074 -0.881 1.074 1.074 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zEducationField          : num [1:434, 1] -0.94 -0.183 0.573 0.573 -0.94 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zEmployeeNumber          : num [1:434, 1] -0.282 1.631 -0.547 -0.395 0.51 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zEnvironmentSatisfaction : num [1:434, 1] -0.638 0.272 -0.638 1.181 1.181 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zGender                  : num [1:434, 1] 0.828 -1.207 0.828 0.828 0.828 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zHourlyRate              : num [1:434, 1] 0.367 -0.875 -0.875 -1.223 1.659 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zJobInvolvement          : num [1:434, 1] 0.394 0.394 -1.028 0.394 0.394 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zJobLevel                : num [1:434, 1] -0.0358 0.8815 -0.0358 -0.9532 2.7162 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zJobRole                 : num [1:434, 1] 1.002 1.002 -0.217 0.596 -0.624 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zJobSatisfaction         : num [1:434, 1] 1.158 1.158 -0.636 1.158 -0.636 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zMaritalStatus           : num [1:434, 1] -1.509 -0.124 -1.509 -0.124 1.261 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zMonthlyIncome           : num [1:434, 1] -0.432 0.877 0.534 -0.464 2.899 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zMonthlyRate             : num [1:434, 1] -0.714 1.3655 -0.0551 1.7252 -1.4495 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zNumCompaniesWorked      : num [1:434, 1] -0.289 -0.685 -0.289 -1.082 1.298 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zOverTime                : num [1:434, 1] -0.638 -0.638 1.565 -0.638 -0.638 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zPercentSalaryHike       : num [1:434, 1] -1.143 1.034 -1.143 0.762 -0.326 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zPerformanceRating       : num [1:434, 1] -0.423 -0.423 -0.423 -0.423 -0.423 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zRelationshipSatisfaction: num [1:434, 1] 0.266 0.266 0.266 -1.548 -1.548 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zStockOptionLevel        : num [1:434, 1] 0.252 1.418 0.252 0.252 -0.914 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zTotalWorkingYears       : num [1:434, 1] -0.406 0.392 0.259 -0.806 3.32 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zTrainingTimesLastYear   : num [1:434, 1] 0.132 0.132 -0.654 0.132 0.918 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zWorkLifeBalance         : num [1:434, 1] -1.098 0.307 0.307 0.307 0.307 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zYearsAtCompany          : num [1:434, 1] -0.3259 1.1689 -0.658 -0.492 0.0063 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zYearsInCurrentRole      : num [1:434, 1] -0.606 1.592 -0.606 -0.331 -0.331 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zYearsSinceLastPromotion : num [1:434, 1] -0.681 0.889 -0.681 -0.367 1.516 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ zYearsWithCurrManager    : num [1:434, 1] -0.319 0.8 -0.599 -0.599 0.8 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : NULL
##  $ Attrition                : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##run knn k=8 and make ROC
preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=as.factor(train$Attrition), k=8, prob=TRUE)
prob.knn <- attr(preds.knn, "prob")
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn = performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")
auc <- auc@y.values
auc.knns <- c(auc.knns, auc)
plot(roc.perf_knn, colorize = TRUE)

auc.knns <- c()
for(i in 1:80) {
#train test split
train <- split_train_test(employees_num)[[1]]
test <- split_train_test(employees_num)[[2]]

##get auc of knn for many ks
preds.knn <- knn(train[, names(train) != "Attrition"], test[, names(train) != "Attrition"], cl=train$Attrition, k=i, prob=TRUE)
prob.knn <- attr(preds.knn, "prob")
preds.knn <- prediction(prob.knn, test$Attrition)
roc.perf_knn = performance(preds.knn, measure = "tpr", x.measure = "fpr")
auc <- performance(preds.knn, measure = "auc")
auc <- auc@y.values
auc.knns <- c(auc.knns, auc)
}
plot(x=1:80, y=auc.knns, xlab="k", main = "AUCs of KNN models with k=[1,80]")

#they're all still bad
#RF
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
train <- split_train_test(employees)[[1]]
test <- split_train_test(employees)[[2]]

model.rf <- randomForest(Attrition ~ ., data = train, importance = TRUE)
preds.rf <- predict(model.rf, test)

confusionMatrix(table(preds.rf, test$Attrition), positive = "Yes")
## Confusion Matrix and Statistics
## 
##         
## preds.rf  No Yes
##      No  321  15
##      Yes  44  55
##                                           
##                Accuracy : 0.8644          
##                  95% CI : (0.8286, 0.8951)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 0.0831147       
##                                           
##                   Kappa : 0.5698          
##                                           
##  Mcnemar's Test P-Value : 0.0002671       
##                                           
##             Sensitivity : 0.7857          
##             Specificity : 0.8795          
##          Pos Pred Value : 0.5556          
##          Neg Pred Value : 0.9554          
##              Prevalence : 0.1609          
##          Detection Rate : 0.1264          
##    Detection Prevalence : 0.2276          
##       Balanced Accuracy : 0.8326          
##                                           
##        'Positive' Class : Yes             
## 
prob.rf <- predict(model.rf, test, type = "prob")
preds.rf <- prediction(prob.rf[,2], test$Attrition)
roc.rf = performance(preds.rf, measure = "tpr", x.measure = "fpr")

#the balanced one definitely looks better and .25 seems a good threshold
plot(roc.perf_1, col="red")
plot(roc.rf, add = TRUE, col="blue")

Random forest wins!

Regression time

What continuous variables have a difference in distribution of the Monthly Income? pairs plots continuous

library(GGally)

employees %>%
  ggplot(aes(x=MonthlyIncome)) +
  geom_histogram()

employees %>%
  select_if(is.numeric) %>%
  select(1:5) %>%
  mutate(MonthlyIncome = employees$MonthlyIncome) %>%
  sample_n(200) %>%
  ggpairs() + 
  ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

employees %>%
  select_if(is.numeric) %>%
  select(6:10) %>%
  mutate(MonthlyIncome = employees$MonthlyIncome) %>%
  sample_n(200) %>%
  ggpairs() + 
  ggtitle("Pairs Plot")

employees %>%
  select_if(is.numeric) %>%
  select(11:15) %>%
  mutate(MonthlyIncome = employees$MonthlyIncome) %>%
  sample_n(200) %>%
  ggpairs() + 
  ggtitle("Pairs Plot")
## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

## Warning in cor(x, y, method = method, use = use): the standard deviation is zero

employees %>%
  select_if(is.numeric) %>%
  select(16:19) %>%
  mutate(MonthlyIncome = employees$MonthlyIncome) %>%
  sample_n(200) %>%
  ggpairs() + 
  ggtitle("Pairs Plot")

What categorical variables have a difference in distribution in response?

#there's 16 categorical variables
employees %>%
  select_if(is.factor) %>%
  dim()
## [1] 870  16
categs <- names(select_if(employees, is.factor))

for(i in 1:length(categs)){
  print(employees %>%
    ggplot(aes(x = eval(parse(text=categs[i])), y=MonthlyIncome)) +
    geom_boxplot() +
    xlab(categs[i])
  )
}

Make train test split

folds <- createFolds(employees$ID, k=2)
train <- employees[folds$Fold1,]
test <- employees[folds$Fold2,]

manually selected variables based on EDA plots

#make train test split
folds <- createFolds(employees$ID, k=2)
train <- employees[folds$Fold1,]
test <- employees[folds$Fold2,]

#fit the manual model
lm.manual <- lm(MonthlyIncome ~ Age + EmployeeNumber + NumCompaniesWorked +
                  TotalWorkingYears + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion + Attrition + BusinessTravel + Department + Education + EducationField + JobRole +  StockOptionLevel, data=train)

#check residuals and summary
plot(lm.manual)

summary(lm.manual)
## 
## Call:
## lm(formula = MonthlyIncome ~ Age + EmployeeNumber + NumCompaniesWorked + 
##     TotalWorkingYears + YearsAtCompany + YearsInCurrentRole + 
##     YearsSinceLastPromotion + Attrition + BusinessTravel + Department + 
##     Education + EducationField + JobRole + StockOptionLevel, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4218.5 -1044.3  -155.5   933.2  4796.3 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       5.156e+03  1.098e+03   4.695 3.66e-06 ***
## Age                              -5.492e+00  1.200e+01  -0.458  0.64734    
## EmployeeNumber                   -2.044e-02  1.358e-01  -0.151  0.88041    
## NumCompaniesWorked               -3.731e+01  3.703e+01  -1.007  0.31431    
## TotalWorkingYears                 1.815e+02  2.132e+01   8.514 3.39e-16 ***
## YearsAtCompany                    2.902e+00  2.843e+01   0.102  0.91874    
## YearsInCurrentRole                3.750e+01  3.643e+01   1.030  0.30386    
## YearsSinceLastPromotion           1.394e+01  3.421e+01   0.407  0.68388    
## AttritionYes                      9.980e+01  2.427e+02   0.411  0.68117    
## BusinessTravelTravel_Frequently   4.729e+02  3.067e+02   1.542  0.12382    
## BusinessTravelTravel_Rarely       4.331e+02  2.553e+02   1.696  0.09061 .  
## DepartmentResearch & Development -3.796e+02  9.358e+02  -0.406  0.68521    
## DepartmentSales                  -6.646e+02  1.001e+03  -0.664  0.50720    
## EducationBelow College            1.043e+02  2.739e+02   0.381  0.70364    
## EducationCollege                  1.660e+02  2.236e+02   0.742  0.45830    
## EducationDoctor                   3.435e+02  5.081e+02   0.676  0.49938    
## EducationMaster                   1.316e+02  2.069e+02   0.636  0.52517    
## EducationFieldLife Sciences      -7.779e+01  7.846e+02  -0.099  0.92108    
## EducationFieldMarketing           6.922e+01  8.448e+02   0.082  0.93474    
## EducationFieldMedical            -1.813e+02  7.993e+02  -0.227  0.82067    
## EducationFieldOther               2.744e+02  8.467e+02   0.324  0.74602    
## EducationFieldTechnical Degree   -1.801e+01  8.350e+02  -0.022  0.98281    
## JobRoleHuman Resources           -3.253e+03  1.045e+03  -3.114  0.00198 ** 
## JobRoleLaboratory Technician     -3.441e+03  3.462e+02  -9.940  < 2e-16 ***
## JobRoleManager                    7.963e+03  5.583e+02  14.262  < 2e-16 ***
## JobRoleManufacturing Director    -7.337e+00  3.725e+02  -0.020  0.98430    
## JobRoleResearch Director          6.417e+03  4.477e+02  14.332  < 2e-16 ***
## JobRoleResearch Scientist        -3.237e+03  3.402e+02  -9.515  < 2e-16 ***
## JobRoleSales Executive           -1.897e+01  8.344e+02  -0.023  0.98187    
## JobRoleSales Representative      -2.827e+03  9.154e+02  -3.088  0.00215 ** 
## StockOptionLevel1                 4.526e+00  1.774e+02   0.026  0.97966    
## StockOptionLevel2                -9.085e+01  3.105e+02  -0.293  0.77002    
## StockOptionLevel3                 2.434e+01  3.233e+02   0.075  0.94004    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1633 on 402 degrees of freedom
## Multiple R-squared:  0.8806, Adjusted R-squared:  0.8711 
## F-statistic: 92.68 on 32 and 402 DF,  p-value: < 2.2e-16
#prediction error
preds.lm.manual <- predict(lm.manual, test)
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.lm.manual, test$MonthlyIncome)
## [1] 1691.034

full model

#make train test split
folds <- createFolds(employees$ID, k=2)
train <- employees[folds$Fold1,]
test <- employees[folds$Fold2,]

#fit the model
lm.full <- lm(MonthlyIncome ~ ., data=train)

#check residuals and summary
plot(lm.full)

summary(lm.full)
## 
## Call:
## lm(formula = MonthlyIncome ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3229.1  -643.1   -52.6   550.6  3449.0 
## 
## Coefficients: (2 not defined because of singularities)
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -1.476e+03  1.046e+03  -1.411  0.15909    
## ID                                -2.085e-01  2.246e-01  -0.928  0.35386    
## Age                                1.070e+01  8.408e+00   1.272  0.20413    
## AttritionYes                       3.140e+01  1.813e+02   0.173  0.86260    
## BusinessTravelTravel_Frequently    2.094e+02  2.095e+02   0.999  0.31831    
## BusinessTravelTravel_Rarely        2.844e+02  1.775e+02   1.603  0.10986    
## DailyRate                          1.586e-01  1.404e-01   1.129  0.25943    
## DepartmentResearch & Development   4.316e+01  8.165e+02   0.053  0.95787    
## DepartmentSales                   -8.244e+01  8.488e+02  -0.097  0.92268    
## DistanceFromHome                  -3.705e+00  6.610e+00  -0.561  0.57539    
## EducationBelow College             1.369e+02  1.838e+02   0.745  0.45676    
## EducationCollege                  -8.993e+01  1.490e+02  -0.603  0.54656    
## EducationDoctor                   -7.145e+02  3.585e+02  -1.993  0.04696 *  
## EducationMaster                    6.158e+01  1.447e+02   0.426  0.67070    
## EducationFieldLife Sciences        1.218e+02  5.859e+02   0.208  0.83542    
## EducationFieldMarketing           -5.302e+01  6.236e+02  -0.085  0.93229    
## EducationFieldMedical             -8.336e+01  5.899e+02  -0.141  0.88770    
## EducationFieldOther                5.816e+01  6.159e+02   0.094  0.92481    
## EducationFieldTechnical Degree     1.667e+02  6.060e+02   0.275  0.78342    
## EmployeeCount                             NA         NA      NA       NA    
## EmployeeNumber                     7.222e-02  8.944e-02   0.808  0.41987    
## EnvironmentSatisfactionLow        -3.197e+01  1.667e+02  -0.192  0.84807    
## EnvironmentSatisfactionMedium     -1.972e+02  1.544e+02  -1.277  0.20250    
## EnvironmentSatisfactionVery High  -9.328e+01  1.411e+02  -0.661  0.50910    
## GenderMale                         2.197e+02  1.110e+02   1.980  0.04847 *  
## HourlyRate                        -1.217e+00  2.694e+00  -0.452  0.65179    
## JobInvolvementLow                  2.751e+02  2.530e+02   1.087  0.27755    
## JobInvolvementMedium               1.048e+02  1.267e+02   0.827  0.40873    
## JobInvolvementVery High            5.503e+02  2.109e+02   2.610  0.00943 ** 
## JobLevel                           2.763e+03  1.273e+02  21.700  < 2e-16 ***
## JobRoleHuman Resources             1.496e+02  8.536e+02   0.175  0.86100    
## JobRoleLaboratory Technician      -4.140e+02  2.578e+02  -1.606  0.10918    
## JobRoleManager                     4.736e+03  4.045e+02  11.710  < 2e-16 ***
## JobRoleManufacturing Director      2.966e+02  2.596e+02   1.143  0.25394    
## JobRoleResearch Director           4.748e+03  3.246e+02  14.624  < 2e-16 ***
## JobRoleResearch Scientist         -2.176e+02  2.617e+02  -0.832  0.40618    
## JobRoleSales Executive             3.929e+02  5.502e+02   0.714  0.47557    
## JobRoleSales Representative       -6.878e+01  5.910e+02  -0.116  0.90741    
## JobSatisfactionLow                -9.248e+00  1.607e+02  -0.058  0.95413    
## JobSatisfactionMedium             -1.576e+02  1.595e+02  -0.989  0.32352    
## JobSatisfactionVery High           1.619e+01  1.392e+02   0.116  0.90746    
## MaritalStatusMarried               3.127e+02  1.521e+02   2.056  0.04046 *  
## MaritalStatusSingle                2.415e+02  2.355e+02   1.025  0.30584    
## MonthlyRate                       -1.750e-02  7.743e-03  -2.260  0.02439 *  
## NumCompaniesWorked                -3.491e+00  2.657e+01  -0.131  0.89552    
## OverTimeYes                       -4.624e+01  1.272e+02  -0.363  0.71649    
## PercentSalaryHike                 -1.044e+00  2.355e+01  -0.044  0.96466    
## PerformanceRatingOutstanding      -2.740e+02  2.389e+02  -1.147  0.25219    
## RelationshipSatisfactionLow        3.948e+00  1.553e+02   0.025  0.97973    
## RelationshipSatisfactionMedium     1.855e+02  1.564e+02   1.186  0.23623    
## RelationshipSatisfactionVery High  1.652e+02  1.405e+02   1.176  0.24042    
## StandardHours                             NA         NA      NA       NA    
## StockOptionLevel1                 -4.679e+01  1.783e+02  -0.262  0.79314    
## StockOptionLevel2                 -2.285e+01  2.431e+02  -0.094  0.92517    
## StockOptionLevel3                  3.941e+02  2.691e+02   1.465  0.14390    
## TotalWorkingYears                  3.271e+01  1.652e+01   1.980  0.04848 *  
## TrainingTimesLastYear              9.897e+01  4.583e+01   2.160  0.03144 *  
## WorkLifeBalanceBest                2.146e+01  2.711e+02   0.079  0.93694    
## WorkLifeBalanceBetter              3.510e+02  2.360e+02   1.487  0.13782    
## WorkLifeBalanceGood                3.099e+02  2.546e+02   1.217  0.22428    
## YearsAtCompany                     1.972e+00  2.106e+01   0.094  0.92546    
## YearsInCurrentRole                 9.280e+00  2.647e+01   0.351  0.72611    
## YearsSinceLastPromotion            5.119e+01  2.394e+01   2.138  0.03314 *  
## YearsWithCurrManager              -4.604e+01  2.511e+01  -1.833  0.06754 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1067 on 374 degrees of freedom
## Multiple R-squared:  0.953,  Adjusted R-squared:  0.9453 
## F-statistic: 124.2 on 61 and 374 DF,  p-value: < 2.2e-16
#prediction error
preds.lm.full <- predict(lm.full, test)
## Warning in predict.lm(lm.full, test): prediction from a rank-deficient fit may
## be misleading
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.lm.full, test$MonthlyIncome)
## [1] 1143.698

LASSO

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 3.0-2
#make train test split
folds <- createFolds(employees$ID, k=2)
train <- employees[folds$Fold1,]
test <- employees[folds$Fold2,]

#find the lasso variable choices
x_vars <- model.matrix(MonthlyIncome~. , employees)[,-1]
cvfit <- cv.glmnet(x_vars, employees$MonthlyIncome)
coef(cvfit, s = "lambda.1se")
## 64 x 1 sparse Matrix of class "dgCMatrix"
##                                            1
## (Intercept)                       -567.21892
## ID                                   .      
## Age                                  .      
## AttritionYes                         .      
## BusinessTravelTravel_Frequently      .      
## BusinessTravelTravel_Rarely          .      
## DailyRate                            .      
## DepartmentResearch & Development     .      
## DepartmentSales                      .      
## DistanceFromHome                     .      
## EducationBelow College               .      
## EducationCollege                     .      
## EducationDoctor                      .      
## EducationMaster                      .      
## EducationFieldLife Sciences          .      
## EducationFieldMarketing              .      
## EducationFieldMedical                .      
## EducationFieldOther                  .      
## EducationFieldTechnical Degree       .      
## EmployeeCount                        .      
## EmployeeNumber                       .      
## EnvironmentSatisfactionLow           .      
## EnvironmentSatisfactionMedium        .      
## EnvironmentSatisfactionVery High     .      
## GenderMale                           .      
## HourlyRate                           .      
## JobInvolvementLow                    .      
## JobInvolvementMedium                 .      
## JobInvolvementVery High              .      
## JobLevel                          3045.71722
## JobRoleHuman Resources               .      
## JobRoleLaboratory Technician       -41.93937
## JobRoleManager                    3154.94800
## JobRoleManufacturing Director        .      
## JobRoleResearch Director          3208.83434
## JobRoleResearch Scientist            .      
## JobRoleSales Executive               .      
## JobRoleSales Representative          .      
## JobSatisfactionLow                   .      
## JobSatisfactionMedium                .      
## JobSatisfactionVery High             .      
## MaritalStatusMarried                 .      
## MaritalStatusSingle                  .      
## MonthlyRate                          .      
## NumCompaniesWorked                   .      
## OverTimeYes                          .      
## PercentSalaryHike                    .      
## PerformanceRatingOutstanding         .      
## RelationshipSatisfactionLow          .      
## RelationshipSatisfactionMedium       .      
## RelationshipSatisfactionVery High    .      
## StandardHours                        .      
## StockOptionLevel1                    .      
## StockOptionLevel2                    .      
## StockOptionLevel3                    .      
## TotalWorkingYears                   34.50207
## TrainingTimesLastYear                .      
## WorkLifeBalanceBest                  .      
## WorkLifeBalanceBetter                .      
## WorkLifeBalanceGood                  .      
## YearsAtCompany                       .      
## YearsInCurrentRole                   .      
## YearsSinceLastPromotion              .      
## YearsWithCurrManager                 .
#fit the model
lm.lasso <- lm(MonthlyIncome ~ Attrition+BusinessTravel+ DistanceFromHome + Education + EmployeeNumber + EnvironmentSatisfaction + Gender + JobLevel + JobRole + MonthlyRate + RelationshipSatisfaction+ TotalWorkingYears + YearsInCurrentRole + YearsSinceLastPromotion + YearsWithCurrManager, data=train)

#check residuals and summary
plot(lm.lasso)

summary(lm.lasso)
## 
## Call:
## lm(formula = MonthlyIncome ~ Attrition + BusinessTravel + DistanceFromHome + 
##     Education + EmployeeNumber + EnvironmentSatisfaction + Gender + 
##     JobLevel + JobRole + MonthlyRate + RelationshipSatisfaction + 
##     TotalWorkingYears + YearsInCurrentRole + YearsSinceLastPromotion + 
##     YearsWithCurrManager, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3464.8  -670.5   -16.2   624.8  4236.8 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       -4.906e+02  3.817e+02  -1.285   0.1995    
## AttritionYes                      -1.943e+01  1.458e+02  -0.133   0.8940    
## BusinessTravelTravel_Frequently    1.193e+02  2.101e+02   0.568   0.5705    
## BusinessTravelTravel_Rarely        4.425e+02  1.840e+02   2.405   0.0166 *  
## DistanceFromHome                  -3.077e+00  6.693e+00  -0.460   0.6460    
## EducationBelow College             1.965e+02  1.801e+02   1.091   0.2758    
## EducationCollege                  -2.074e+02  1.469e+02  -1.411   0.1589    
## EducationDoctor                   -1.890e+02  3.480e+02  -0.543   0.5873    
## EducationMaster                   -7.534e+01  1.332e+02  -0.566   0.5720    
## EmployeeNumber                     8.797e-02  9.003e-02   0.977   0.3291    
## EnvironmentSatisfactionLow         6.148e+01  1.532e+02   0.401   0.6884    
## EnvironmentSatisfactionMedium      7.514e+01  1.556e+02   0.483   0.6295    
## EnvironmentSatisfactionVery High  -7.540e+01  1.404e+02  -0.537   0.5914    
## GenderMale                         1.682e+02  1.099e+02   1.530   0.1268    
## JobLevel                           2.666e+03  1.226e+02  21.752  < 2e-16 ***
## JobRoleHuman Resources            -3.517e+02  3.782e+02  -0.930   0.3531    
## JobRoleLaboratory Technician      -6.199e+02  2.425e+02  -2.556   0.0110 *  
## JobRoleManager                     4.288e+03  3.349e+02  12.804  < 2e-16 ***
## JobRoleManufacturing Director      3.321e+02  2.672e+02   1.243   0.2145    
## JobRoleResearch Director           4.313e+03  3.119e+02  13.830  < 2e-16 ***
## JobRoleResearch Scientist         -4.133e+02  2.455e+02  -1.683   0.0931 .  
## JobRoleSales Executive             1.208e+02  2.132e+02   0.567   0.5712    
## JobRoleSales Representative       -3.390e+02  3.055e+02  -1.110   0.2679    
## MonthlyRate                       -3.127e-03  7.355e-03  -0.425   0.6710    
## RelationshipSatisfactionLow        1.550e+02  1.507e+02   1.028   0.3044    
## RelationshipSatisfactionMedium     5.306e+01  1.619e+02   0.328   0.7433    
## RelationshipSatisfactionVery High  1.508e+02  1.380e+02   1.092   0.2753    
## TotalWorkingYears                  5.812e+01  1.278e+01   4.549 7.13e-06 ***
## YearsInCurrentRole                 1.776e+01  2.074e+01   0.856   0.3925    
## YearsSinceLastPromotion            2.466e+01  2.197e+01   1.123   0.2623    
## YearsWithCurrManager              -4.598e+01  2.050e+01  -2.243   0.0255 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1075 on 403 degrees of freedom
## Multiple R-squared:  0.9464, Adjusted R-squared:  0.9424 
## F-statistic:   237 on 30 and 403 DF,  p-value: < 2.2e-16
#prediction error
preds.lm.lasso <- predict(lm.lasso, test)
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.lm.lasso, test$MonthlyIncome)
## [1] 1062.767

Forward step model

#make train test split
folds <- createFolds(employees$ID, k=2)
train <- employees[folds$Fold1,]
test <- employees[folds$Fold2,]

#find the forward selection variables
model.null<-lm(MonthlyIncome ~ 1, data=employees)
model.complex <- lm(MonthlyIncome ~ (Age + EmployeeNumber + NumCompaniesWorked +
                  TotalWorkingYears + YearsAtCompany + YearsInCurrentRole + YearsSinceLastPromotion + Attrition + BusinessTravel + Department + Education + EducationField + JobRole +  StockOptionLevel)^2 + ., data=train)
model.forward <- step(model.null,
                   scope = list(upper=model.complex),
                   direction="forward",
                   data=employees)
## Start:  AIC=14674.96
## MonthlyIncome ~ 1
## 
##                            Df  Sum of Sq        RSS   AIC
## + JobLevel                  1 1.6636e+10 1.7337e+09 12623
## + JobRole                   8 1.4888e+10 3.4814e+09 13244
## + TotalWorkingYears         1 1.1133e+10 7.2362e+09 13866
## + YearsAtCompany            1 4.4354e+09 1.3934e+10 14436
## + Age                       1 4.3083e+09 1.4061e+10 14444
## + YearsInCurrentRole        1 2.4051e+09 1.5965e+10 14555
## + YearsWithCurrManager      1 1.9822e+09 1.6387e+10 14578
## + YearsSinceLastPromotion   1 1.8333e+09 1.6536e+10 14586
## + NumCompaniesWorked        1 4.4644e+08 1.7923e+10 14656
## + Attrition                 1 4.4085e+08 1.7929e+10 14656
## + Education                 4 3.5197e+08 1.8018e+10 14666
## + MaritalStatus             2 1.9261e+08 1.8177e+10 14670
## + MonthlyRate               1 7.6645e+07 1.8293e+10 14673
## + StockOptionLevel          3 1.5399e+08 1.8216e+10 14674
## + Gender                    1 5.5886e+07 1.8314e+10 14674
## + PercentSalaryHike         1 5.3300e+07 1.8316e+10 14674
## + ID                        1 4.8984e+07 1.8321e+10 14675
## <none>                                   1.8370e+10 14675
## + PerformanceRating         1 3.4173e+07 1.8335e+10 14675
## + Department                2 7.5183e+07 1.8294e+10 14675
## + BusinessTravel            2 7.3567e+07 1.8296e+10 14676
## + TrainingTimesLastYear     1 2.7961e+07 1.8342e+10 14676
## + EmployeeNumber            1 2.1490e+07 1.8348e+10 14676
## + OverTime                  1 1.1729e+07 1.8358e+10 14676
## + DistanceFromHome          1 8.1655e+05 1.8369e+10 14677
## + HourlyRate                1 1.0503e+05 1.8370e+10 14677
## + DailyRate                 1 1.4200e+02 1.8370e+10 14677
## + JobSatisfaction           3 5.6010e+07 1.8314e+10 14678
## + EducationField            5 1.1661e+08 1.8253e+10 14679
## + WorkLifeBalance           3 3.1221e+07 1.8338e+10 14680
## + EnvironmentSatisfaction   3 3.0267e+07 1.8339e+10 14680
## + RelationshipSatisfaction  3 2.0585e+07 1.8349e+10 14680
## + JobInvolvement            3 5.1176e+06 1.8365e+10 14681
## 
## Step:  AIC=12623.4
## MonthlyIncome ~ JobLevel
## 
##                            Df Sum of Sq        RSS   AIC
## + JobRole                   8 722387200 1011360314 12170
## + TotalWorkingYears         1  59348516 1674398998 12595
## + Department                2  53738569 1680008945 12600
## + Age                       1  18766260 1714981254 12616
## + BusinessTravel            2  22678748 1711068767 12616
## + DistanceFromHome          1  13740838 1720006676 12618
## + NumCompaniesWorked        1   8964138 1724783376 12621
## + YearsWithCurrManager      1   8790603 1724956912 12621
## + EducationField            5  23514517 1710232998 12622
## <none>                                  1733747514 12623
## + ID                        1   3750153 1729997361 12624
## + MaritalStatus             2   6608834 1727138681 12624
## + YearsInCurrentRole        1   2410053 1731337461 12624
## + Gender                    1   2066377 1731681137 12624
## + HourlyRate                1   1863245 1731884269 12624
## + JobInvolvement            3   9325826 1724421688 12625
## + EmployeeNumber            1   1152961 1732594553 12625
## + PerformanceRating         1    731501 1733016014 12625
## + MonthlyRate               1    723684 1733023830 12625
## + PercentSalaryHike         1    535748 1733211766 12625
## + YearsAtCompany            1    409075 1733338439 12625
## + TrainingTimesLastYear     1    394917 1733352598 12625
## + DailyRate                 1     50672 1733696843 12625
## + YearsSinceLastPromotion   1     43699 1733703815 12625
## + OverTime                  1     29064 1733718450 12625
## + Attrition                 1      7238 1733740276 12625
## + RelationshipSatisfaction  3   7455506 1726292008 12626
## + EnvironmentSatisfaction   3   6529811 1727217704 12626
## + StockOptionLevel          3   4430571 1729316944 12627
## + WorkLifeBalance           3   2755349 1730992165 12628
## + JobSatisfaction           3   1406896 1732340618 12629
## + Education                 4   5345894 1728401620 12629
## 
## Step:  AIC=12170.48
## MonthlyIncome ~ JobLevel + JobRole
## 
##                            Df Sum of Sq        RSS   AIC
## + TotalWorkingYears         1  41160686  970199629 12136
## + BusinessTravel            2  14967502  996392812 12162
## + YearsSinceLastPromotion   1   9817643 1001542671 12164
## + Age                       1   9789787 1001570528 12164
## + YearsAtCompany            1   4037001 1007323313 12169
## + NumCompaniesWorked        1   4036366 1007323948 12169
## + ID                        1   3714160 1007646154 12169
## + DistanceFromHome          1   3615890 1007744424 12169
## + MonthlyRate               1   3392275 1007968040 12170
## + DailyRate                 1   3029272 1008331042 12170
## + Education                 4   9867535 1001492780 12170
## + Gender                    1   2661752 1008698562 12170
## + YearsInCurrentRole        1   2566679 1008793635 12170
## + PerformanceRating         1   2497230 1008863084 12170
## <none>                                  1011360314 12170
## + EmployeeNumber            1   2146125 1009214189 12171
## + TrainingTimesLastYear     1    350625 1011009690 12172
## + Attrition                 1    159796 1011200518 12172
## + PercentSalaryHike         1     84128 1011276187 12172
## + HourlyRate                1     64570 1011295744 12172
## + YearsWithCurrManager      1     38404 1011321911 12172
## + OverTime                  1       950 1011359365 12172
## + Department                2   1652354 1009707960 12173
## + MaritalStatus             2   1504761 1009855553 12173
## + WorkLifeBalance           3   3656169 1007704146 12173
## + StockOptionLevel          3   2837486 1008522828 12174
## + RelationshipSatisfaction  3   2310928 1009049386 12174
## + JobSatisfaction           3   1851173 1009509141 12175
## + JobInvolvement            3   1831830 1009528484 12175
## + EnvironmentSatisfaction   3    274454 1011085860 12176
## + EducationField            5   1448401 1009911913 12179
## 
## Step:  AIC=12136.33
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears
## 
##                             Df Sum of Sq       RSS   AIC
## + TotalWorkingYears:JobRole  8  69573387 900626241 12088
## + BusinessTravel             2  14325268 955874361 12127
## + YearsWithCurrManager       1   3661817 966537812 12135
## + Gender                     1   3545977 966653651 12135
## + MonthlyRate                1   3453163 966746466 12135
## + ID                         1   3270835 966928793 12135
## + DailyRate                  1   3163711 967035918 12136
## + DistanceFromHome           1   3136452 967063177 12136
## <none>                                   970199629 12136
## + PerformanceRating          1   2189208 968010421 12136
## + EmployeeNumber             1   1996410 968203219 12136
## + YearsSinceLastPromotion    1   1318418 968881210 12137
## + Education                  4   7804863 962394766 12137
## + Attrition                  1    644137 969555492 12138
## + YearsAtCompany             1    586481 969613147 12138
## + Department                 2   2647259 967552370 12138
## + TrainingTimesLastYear      1    395815 969803813 12138
## + NumCompaniesWorked         1    288899 969910730 12138
## + YearsInCurrentRole         1    281184 969918445 12138
## + OverTime                   1     26815 970172814 12138
## + PercentSalaryHike          1     14208 970185420 12138
## + HourlyRate                 1     10665 970188964 12138
## + Age                        1      4644 970194985 12138
## + WorkLifeBalance            3   3373073 966826556 12139
## + MaritalStatus              2    950845 969248783 12140
## + RelationshipSatisfaction   3   2520254 967679375 12140
## + StockOptionLevel           3   1686890 968512738 12141
## + JobSatisfaction            3   1512963 968686666 12141
## + JobInvolvement             3   1096298 969103331 12141
## + EnvironmentSatisfaction    3     85820 970113809 12142
## + EducationField             5   1829621 968370008 12145
## 
## Step:  AIC=12087.59
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + JobRole:TotalWorkingYears
## 
##                            Df Sum of Sq       RSS   AIC
## + BusinessTravel            2  15130983 885495259 12077
## + DailyRate                 1   4259385 896366857 12086
## + Gender                    1   3421421 897204820 12086
## + ID                        1   3362793 897263448 12086
## + DistanceFromHome          1   2724073 897902169 12087
## + MonthlyRate               1   2551486 898074755 12087
## <none>                                  900626241 12088
## + PerformanceRating         1   1962232 898664009 12088
## + EmployeeNumber            1   1377351 899248891 12088
## + NumCompaniesWorked        1    933019 899693223 12089
## + Department                2   2953976 897672265 12089
## + YearsWithCurrManager      1    714405 899911836 12089
## + TrainingTimesLastYear     1    423706 900202535 12089
## + YearsAtCompany            1    187385 900438857 12089
## + YearsSinceLastPromotion   1    180031 900446210 12089
## + YearsInCurrentRole        1    147890 900478352 12090
## + HourlyRate                1     53797 900572444 12090
## + PercentSalaryHike         1     37841 900588400 12090
## + Age                       1     14316 900611925 12090
## + Attrition                 1      1835 900624406 12090
## + OverTime                  1       216 900626026 12090
## + Education                 4   5587649 895038592 12090
## + MaritalStatus             2    940889 899685352 12091
## + RelationshipSatisfaction  3   2180819 898445422 12092
## + JobSatisfaction           3   2081901 898544341 12092
## + WorkLifeBalance           3   1987383 898638858 12092
## + JobInvolvement            3   1756180 898870061 12092
## + StockOptionLevel          3    947934 899678308 12093
## + EnvironmentSatisfaction   3    876975 899749266 12093
## + EducationField            5   2191845 898434397 12096
## 
## Step:  AIC=12076.85
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel + 
##     JobRole:TotalWorkingYears
## 
##                                    Df Sum of Sq       RSS   AIC
## + DailyRate                         1   4685745 880809514 12074
## + Gender                            1   3617564 881877694 12075
## + ID                                1   3099132 882396126 12076
## + TotalWorkingYears:BusinessTravel  2   4861257 880634002 12076
## + MonthlyRate                       1   2640304 882854954 12076
## <none>                                          885495259 12077
## + DistanceFromHome                  1   1909965 883585294 12077
## + PerformanceRating                 1   1813918 883681341 12077
## + Department                        2   3725302 881769957 12077
## + EmployeeNumber                    1   1277817 884217442 12078
## + NumCompaniesWorked                1   1195368 884299891 12078
## + YearsWithCurrManager              1    460265 885034993 12078
## + YearsSinceLastPromotion           1    386696 885108563 12078
## + TrainingTimesLastYear             1    368327 885126932 12078
## + YearsAtCompany                    1    134176 885361082 12079
## + YearsInCurrentRole                1    132213 885363046 12079
## + HourlyRate                        1     74757 885420501 12079
## + Age                               1     60913 885434345 12079
## + OverTime                          1     24933 885470326 12079
## + PercentSalaryHike                 1     17907 885477352 12079
## + Attrition                         1      6338 885488921 12079
## + Education                         4   5473016 880022243 12080
## + MaritalStatus                     2    525792 884969467 12080
## + RelationshipSatisfaction          3   2087177 883408082 12081
## + JobSatisfaction                   3   1833142 883662117 12081
## + WorkLifeBalance                   3   1610533 883884726 12081
## + JobInvolvement                    3   1498776 883996483 12081
## + StockOptionLevel                  3   1053946 884441312 12082
## + EnvironmentSatisfaction           3    789969 884705289 12082
## + EducationField                    5   2005697 883489562 12085
## + BusinessTravel:JobRole           16   4474286 881020972 12104
## 
## Step:  AIC=12074.24
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel + 
##     DailyRate + JobRole:TotalWorkingYears
## 
##                                    Df Sum of Sq       RSS   AIC
## + Gender                            1   3717903 877091611 12073
## + TotalWorkingYears:BusinessTravel  2   4953275 875856239 12073
## + ID                                1   2850825 877958689 12073
## + MonthlyRate                       1   2489631 878319883 12074
## <none>                                          880809514 12074
## + DistanceFromHome                  1   1937404 878872110 12074
## + Department                        2   3892746 876916767 12074
## + PerformanceRating                 1   1690986 879118528 12075
## + EmployeeNumber                    1   1491301 879318213 12075
## + NumCompaniesWorked                1    999804 879809710 12075
## + YearsSinceLastPromotion           1    547055 880262459 12076
## + TrainingTimesLastYear             1    397499 880412015 12076
## + YearsWithCurrManager              1    334068 880475446 12076
## + YearsInCurrentRole                1    161981 880647533 12076
## + HourlyRate                        1    151536 880657978 12076
## + YearsAtCompany                    1     77052 880732462 12076
## + Age                               1     42012 880767502 12076
## + PercentSalaryHike                 1     38379 880771135 12076
## + OverTime                          1     29157 880780357 12076
## + Attrition                         1      1842 880807672 12076
## + Education                         4   5645632 875163882 12077
## + MaritalStatus                     2    453020 880356494 12078
## + RelationshipSatisfaction          3   1984894 878824620 12078
## + JobSatisfaction                   3   1887121 878922393 12078
## + WorkLifeBalance                   3   1669308 879140206 12079
## + JobInvolvement                    3   1349136 879460378 12079
## + StockOptionLevel                  3    948685 879860829 12079
## + EnvironmentSatisfaction           3    724954 880084560 12080
## + EducationField                    5   1985368 878824146 12082
## + BusinessTravel:JobRole           16   4305914 876503600 12102
## 
## Step:  AIC=12072.56
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel + 
##     DailyRate + Gender + JobRole:TotalWorkingYears
## 
##                                    Df Sum of Sq       RSS   AIC
## + TotalWorkingYears:BusinessTravel  2   4863805 872227806 12072
## + ID                                1   2783995 874307616 12072
## + MonthlyRate                       1   2362447 874729164 12072
## <none>                                          877091611 12073
## + DistanceFromHome                  1   1860842 875230769 12073
## + Department                        2   3739550 873352061 12073
## + PerformanceRating                 1   1605947 875485664 12073
## + EmployeeNumber                    1   1526961 875564650 12073
## + NumCompaniesWorked                1    966921 876124689 12074
## + YearsSinceLastPromotion           1    537724 876553887 12074
## + TrainingTimesLastYear             1    406328 876685283 12074
## + YearsInCurrentRole                1    251176 876840435 12074
## + YearsWithCurrManager              1    231171 876860440 12074
## + HourlyRate                        1    182167 876909444 12074
## + YearsAtCompany                    1     43746 877047865 12074
## + Age                               1     40204 877051407 12074
## + PercentSalaryHike                 1     39316 877052295 12074
## + OverTime                          1     34861 877056750 12074
## + Attrition                         1      5515 877086095 12075
## + Education                         4   5536801 871554810 12075
## + MaritalStatus                     2    653056 876438555 12076
## + WorkLifeBalance                   3   1799081 875292530 12077
## + JobSatisfaction                   3   1663257 875428354 12077
## + RelationshipSatisfaction          3   1612439 875479172 12077
## + JobInvolvement                    3   1244140 875847471 12077
## + StockOptionLevel                  3    982402 876109209 12078
## + EnvironmentSatisfaction           3    769376 876322235 12078
## + EducationField                    5   1844361 875247250 12081
## + BusinessTravel:JobRole           16   4139017 872952594 12100
## 
## Step:  AIC=12071.72
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel + 
##     DailyRate + Gender + JobRole:TotalWorkingYears + TotalWorkingYears:BusinessTravel
## 
##                            Df Sum of Sq       RSS   AIC
## + ID                        1   2348104 869879702 12071
## + MonthlyRate               1   2269073 869958733 12072
## + Department                2   4118126 868109680 12072
## <none>                                  872227806 12072
## + EmployeeNumber            1   1618510 870609296 12072
## + PerformanceRating         1   1499677 870728129 12072
## + DistanceFromHome          1   1429645 870798161 12072
## + NumCompaniesWorked        1    935399 871292407 12073
## + YearsSinceLastPromotion   1    481788 871746018 12073
## + TrainingTimesLastYear     1    321058 871906748 12073
## + YearsWithCurrManager      1    245844 871981962 12074
## + YearsInCurrentRole        1    170989 872056818 12074
## + HourlyRate                1     76665 872151141 12074
## + Attrition                 1     57292 872170514 12074
## + Age                       1     49458 872178348 12074
## + YearsAtCompany            1     39457 872188349 12074
## + OverTime                  1     38699 872189107 12074
## + PercentSalaryHike         1     21429 872206377 12074
## + MaritalStatus             2    772860 871454946 12075
## + Education                 4   4715081 867512725 12075
## + WorkLifeBalance           3   1811417 870416389 12076
## + RelationshipSatisfaction  3   1602719 870625087 12076
## + JobSatisfaction           3   1589572 870638234 12076
## + JobInvolvement            3   1392219 870835587 12076
## + EnvironmentSatisfaction   3    976818 871250988 12077
## + StockOptionLevel          3    717756 871510050 12077
## + EducationField            5   1826862 870400944 12080
## + BusinessTravel:JobRole   16   5677317 866550489 12098
## 
## Step:  AIC=12071.37
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel + 
##     DailyRate + Gender + ID + JobRole:TotalWorkingYears + TotalWorkingYears:BusinessTravel
## 
##                            Df Sum of Sq       RSS   AIC
## + MonthlyRate               1   2334251 867545451 12071
## + Department                2   4273767 865605935 12071
## <none>                                  869879702 12071
## + EmployeeNumber            1   1507053 868372649 12072
## + PerformanceRating         1   1473002 868406700 12072
## + DistanceFromHome          1   1238820 868640882 12072
## + NumCompaniesWorked        1    915086 868964616 12072
## + YearsSinceLastPromotion   1    520480 869359222 12073
## + TrainingTimesLastYear     1    360724 869518978 12073
## + YearsWithCurrManager      1    284460 869595241 12073
## + YearsInCurrentRole        1    113060 869766642 12073
## + HourlyRate                1     69926 869809775 12073
## + Age                       1     38634 869841068 12073
## + Attrition                 1     26958 869852743 12073
## + OverTime                  1     23823 869855878 12073
## + YearsAtCompany            1     19172 869860530 12073
## + PercentSalaryHike         1     15659 869864042 12073
## + Education                 4   4936752 864942950 12074
## + MaritalStatus             2    881442 868998260 12074
## + WorkLifeBalance           3   1734191 868145511 12076
## + JobSatisfaction           3   1627702 868252000 12076
## + RelationshipSatisfaction  3   1537445 868342257 12076
## + JobInvolvement            3   1172744 868706958 12076
## + EnvironmentSatisfaction   3   1047576 868832126 12076
## + StockOptionLevel          3    682525 869197177 12077
## + EducationField            5   1482235 868397466 12080
## + BusinessTravel:JobRole   16   5649937 864229765 12098
## 
## Step:  AIC=12071.04
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel + 
##     DailyRate + Gender + ID + MonthlyRate + JobRole:TotalWorkingYears + 
##     TotalWorkingYears:BusinessTravel
## 
##                            Df Sum of Sq       RSS   AIC
## + Department                2   4240996 863304454 12071
## <none>                                  867545451 12071
## + EmployeeNumber            1   1674053 865871398 12071
## + PerformanceRating         1   1495227 866050224 12072
## + DistanceFromHome          1   1289911 866255540 12072
## + NumCompaniesWorked        1    924489 866620962 12072
## + YearsSinceLastPromotion   1    477651 867067800 12073
## + YearsWithCurrManager      1    405016 867140435 12073
## + TrainingTimesLastYear     1    340223 867205228 12073
## + YearsInCurrentRole        1    106127 867439324 12073
## + HourlyRate                1     83782 867461669 12073
## + Age                       1     71994 867473457 12073
## + YearsAtCompany            1     66091 867479360 12073
## + Attrition                 1     39060 867506391 12073
## + OverTime                  1     25310 867520141 12073
## + PercentSalaryHike         1     14288 867531163 12073
## + MaritalStatus             2    929070 866616381 12074
## + Education                 4   4777490 862767961 12074
## + WorkLifeBalance           3   1905462 865639989 12075
## + JobSatisfaction           3   1720853 865824597 12075
## + RelationshipSatisfaction  3   1393023 866152428 12076
## + JobInvolvement            3   1282954 866262497 12076
## + EnvironmentSatisfaction   3    935892 866609559 12076
## + StockOptionLevel          3    744152 866801299 12076
## + EducationField            5   1500745 866044705 12080
## + BusinessTravel:JobRole   16   5431376 862114075 12098
## 
## Step:  AIC=12070.77
## MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel + 
##     DailyRate + Gender + ID + MonthlyRate + Department + JobRole:TotalWorkingYears + 
##     TotalWorkingYears:BusinessTravel
## 
##                                Df Sum of Sq       RSS   AIC
## <none>                                      863304454 12071
## + PerformanceRating             1   1519041 861785414 12071
## + EmployeeNumber                1   1442015 861862439 12071
## + DistanceFromHome              1   1152915 862151539 12072
## + NumCompaniesWorked            1   1035125 862269329 12072
## + YearsSinceLastPromotion       1    422382 862882072 12072
## + YearsWithCurrManager          1    365374 862939080 12072
## + TrainingTimesLastYear         1    308912 862995543 12072
## + TotalWorkingYears:Department  2   2251394 861053061 12072
## + YearsInCurrentRole            1     93417 863211037 12073
## + HourlyRate                    1     83015 863221439 12073
## + YearsAtCompany                1     44217 863260237 12073
## + Attrition                     1     35467 863268988 12073
## + Age                           1     30786 863273668 12073
## + OverTime                      1     30614 863273841 12073
## + PercentSalaryHike             1     15683 863288772 12073
## + MaritalStatus                 2    847357 862457098 12074
## + Education                     4   4622165 858682289 12074
## + JobInvolvement                3   1504123 861800332 12075
## + WorkLifeBalance               3   1484778 861819677 12075
## + RelationshipSatisfaction      3   1420059 861884395 12075
## + JobSatisfaction               3   1386061 861918393 12075
## + EnvironmentSatisfaction       3   1074435 862230020 12076
## + StockOptionLevel              3    705555 862598899 12076
## + BusinessTravel:Department     4   1334798 861969656 12077
## + EducationField                5   1318335 861986120 12079
## + BusinessTravel:JobRole       16   5384523 857919931 12097
coef(model.forward)
##                                       (Intercept) 
##                                     -1.265299e+03 
##                                          JobLevel 
##                                      2.767041e+03 
##                            JobRoleHuman Resources 
##                                      6.480376e+02 
##                      JobRoleLaboratory Technician 
##                                      4.960850e+02 
##                                    JobRoleManager 
##                                      4.444156e+03 
##                     JobRoleManufacturing Director 
##                                     -2.079844e+02 
##                          JobRoleResearch Director 
##                                      4.607222e+03 
##                         JobRoleResearch Scientist 
##                                      4.109792e+02 
##                            JobRoleSales Executive 
##                                      4.254338e+02 
##                       JobRoleSales Representative 
##                                      6.843187e+02 
##                                 TotalWorkingYears 
##                                      9.240775e+01 
##                   BusinessTravelTravel_Frequently 
##                                      5.983496e+02 
##                       BusinessTravelTravel_Rarely 
##                                      5.200031e+02 
##                                         DailyRate 
##                                      1.859171e-01 
##                                        GenderMale 
##                                      1.270529e+02 
##                                                ID 
##                                     -2.201902e-01 
##                                       MonthlyRate 
##                                     -7.350286e-03 
##                  DepartmentResearch & Development 
##                                      3.078081e+02 
##                                   DepartmentSales 
##                                     -3.360249e+02 
##          JobRoleHuman Resources:TotalWorkingYears 
##                                     -7.836099e+01 
##    JobRoleLaboratory Technician:TotalWorkingYears 
##                                     -1.211658e+02 
##                  JobRoleManager:TotalWorkingYears 
##                                     -1.864787e+01 
##   JobRoleManufacturing Director:TotalWorkingYears 
##                                      2.945552e+01 
##        JobRoleResearch Director:TotalWorkingYears 
##                                     -3.820688e+01 
##       JobRoleResearch Scientist:TotalWorkingYears 
##                                     -7.756305e+01 
##          JobRoleSales Executive:TotalWorkingYears 
##                                      2.001124e+01 
##     JobRoleSales Representative:TotalWorkingYears 
##                                     -4.755616e+01 
## TotalWorkingYears:BusinessTravelTravel_Frequently 
##                                     -3.259974e+01 
##     TotalWorkingYears:BusinessTravelTravel_Rarely 
##                                     -9.623839e+00
#fit the model
lm.forward <- lm(MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + BusinessTravel + 
    DailyRate + Gender + ID + MonthlyRate + Department + JobRole:TotalWorkingYears + 
    TotalWorkingYears:BusinessTravel, data=train)

#check residuals and summary
summary(lm.forward)
## 
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobRole + TotalWorkingYears + 
##     BusinessTravel + DailyRate + Gender + ID + MonthlyRate + 
##     Department + JobRole:TotalWorkingYears + TotalWorkingYears:BusinessTravel, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3134.1  -612.2  -106.4   635.4  4433.8 
## 
## Coefficients:
##                                                     Estimate Std. Error t value
## (Intercept)                                       -7.381e+02  9.446e+02  -0.781
## JobLevel                                           2.784e+03  1.138e+02  24.458
## JobRoleHuman Resources                             3.082e+02  1.058e+03   0.291
## JobRoleLaboratory Technician                       2.226e+02  4.167e+02   0.534
## JobRoleManager                                     4.053e+03  8.607e+02   4.708
## JobRoleManufacturing Director                     -6.747e+02  4.883e+02  -1.382
## JobRoleResearch Director                           3.885e+03  6.797e+02   5.715
## JobRoleResearch Scientist                          1.801e+02  4.279e+02   0.421
## JobRoleSales Executive                             1.422e+02  5.821e+02   0.244
## JobRoleSales Representative                        6.016e+02  6.393e+02   0.941
## TotalWorkingYears                                  5.942e+01  3.064e+01   1.939
## BusinessTravelTravel_Frequently                    5.352e+02  3.141e+02   1.704
## BusinessTravelTravel_Rarely                        5.864e+02  2.681e+02   2.187
## DailyRate                                          8.080e-02  1.225e-01   0.660
## GenderMale                                         4.808e+01  1.004e+02   0.479
## ID                                                -2.902e-01  1.983e-01  -1.464
## MonthlyRate                                       -6.316e-03  6.862e-03  -0.920
## DepartmentResearch & Development                   1.858e+02  7.966e+02   0.233
## DepartmentSales                                   -4.558e+02  7.759e+02  -0.587
## JobRoleHuman Resources:TotalWorkingYears          -5.504e+01  8.609e+01  -0.639
## JobRoleLaboratory Technician:TotalWorkingYears    -1.187e+02  3.032e+01  -3.915
## JobRoleManager:TotalWorkingYears                   1.463e-01  3.785e+01   0.004
## JobRoleManufacturing Director:TotalWorkingYears    5.033e+01  3.090e+01   1.629
## JobRoleResearch Director:TotalWorkingYears         6.668e+00  3.591e+01   0.186
## JobRoleResearch Scientist:TotalWorkingYears       -6.309e+01  3.213e+01  -1.963
## JobRoleSales Executive:TotalWorkingYears           3.629e+01  2.612e+01   1.389
## JobRoleSales Representative:TotalWorkingYears     -2.538e+01  5.953e+01  -0.426
## TotalWorkingYears:BusinessTravelTravel_Frequently -1.471e+01  2.651e+01  -0.555
## TotalWorkingYears:BusinessTravelTravel_Rarely      1.451e+00  2.365e+01   0.061
##                                                   Pr(>|t|)    
## (Intercept)                                       0.434999    
## JobLevel                                           < 2e-16 ***
## JobRoleHuman Resources                            0.770936    
## JobRoleLaboratory Technician                      0.593567    
## JobRoleManager                                    3.43e-06 ***
## JobRoleManufacturing Director                     0.167782    
## JobRoleResearch Director                          2.13e-08 ***
## JobRoleResearch Scientist                         0.674051    
## JobRoleSales Executive                            0.807192    
## JobRoleSales Representative                       0.347287    
## TotalWorkingYears                                 0.053159 .  
## BusinessTravelTravel_Frequently                   0.089090 .  
## BusinessTravelTravel_Rarely                       0.029309 *  
## DailyRate                                         0.509781    
## GenderMale                                        0.632184    
## ID                                                0.144020    
## MonthlyRate                                       0.357867    
## DepartmentResearch & Development                  0.815677    
## DepartmentSales                                   0.557279    
## JobRoleHuman Resources:TotalWorkingYears          0.522987    
## JobRoleLaboratory Technician:TotalWorkingYears    0.000106 ***
## JobRoleManager:TotalWorkingYears                  0.996917    
## JobRoleManufacturing Director:TotalWorkingYears   0.104087    
## JobRoleResearch Director:TotalWorkingYears        0.852779    
## JobRoleResearch Scientist:TotalWorkingYears       0.050296 .  
## JobRoleSales Executive:TotalWorkingYears          0.165580    
## JobRoleSales Representative:TotalWorkingYears     0.670136    
## TotalWorkingYears:BusinessTravelTravel_Frequently 0.579223    
## TotalWorkingYears:BusinessTravelTravel_Rarely     0.951100    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 998.4 on 406 degrees of freedom
## Multiple R-squared:  0.9526, Adjusted R-squared:  0.9493 
## F-statistic: 291.4 on 28 and 406 DF,  p-value: < 2.2e-16
plot(lm.forward)

#prediction error
preds.lm.forward <- predict(lm.forward, test)
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.lm.forward, test$MonthlyIncome)
## [1] 1048.446

EDA for KNN for income LR EDA

 #Plots of working years vs income, colored by various elements to see if they make a good split
 employees  %>%
   mutate(PromotionBins = cut(employees$YearsSinceLastPromotion, 10)) %>%
   ggplot(aes(x=TotalWorkingYears, MonthlyIncome, colour=PromotionBins)) +
   geom_point()

 employees  %>%
   mutate(TenureBins = cut(employees$YearsAtCompany, 10)) %>%
   ggplot(aes(x=TotalWorkingYears, MonthlyIncome, colour=TenureBins)) +
   geom_point()

 employees  %>%
   mutate(ManagerBins = cut(employees$YearsWithCurrManager, 10)) %>%
   ggplot(aes(x=TotalWorkingYears, MonthlyIncome, colour=ManagerBins)) +
   geom_point()

 employees  %>%
   mutate(RoleBins = cut(employees$YearsInCurrentRole, 10)) %>%
   ggplot(aes(x=TotalWorkingYears, MonthlyIncome, colour=RoleBins)) +
   geom_point()

 employees  %>%
   ggplot(aes(x=TotalWorkingYears, MonthlyIncome, colour=factor(JobLevel))) +
   geom_point()

 employees  %>%
   mutate(IncomeBins = cut(employees$MonthlyIncome, 10)) %>%
   ggplot(aes(x=TotalWorkingYears, JobLevel, colour=IncomeBins)) +
   geom_point()

 employees_yrs <- employees  %>%
   select(c(TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager))

 employees_pcs <- prcomp(employees_yrs, scale = TRUE)

 employees %>%
   ggplot(aes(x=employees_pcs$x[,1], y=MonthlyIncome)) +
   geom_point()

 employees %>%
   mutate(IncomeBins = cut(employees$MonthlyIncome, 10)) %>%
   mutate(PC1 = employees_pcs$x[,1]) %>%
   mutate(PC2 = employees_pcs$x[,2]) %>%
 ggplot(aes(x=PC1, y=PC2, colour=IncomeBins)) +
   geom_point()

KNN regression

employees_z2 <- employees_z %>%
  mutate(MonthlyIncome = employees_num$MonthlyIncome) %>%
  mutate(zAttrition = scale(as.numeric(Attrition)))
employees_z2$Attrition <- NULL
employees_z2$zMonthlyIncome <- NULL
employees_z2$PC1 <- employees_pcs$x[,1]
employees_z2$PC2 <- employees_pcs$x[,2]
employees_z2$PC3 <- employees_pcs$x[,3]

#train test split
folds <- createFolds(employees_z2$MonthlyIncome, k=2)
train <- employees_z2[folds$Fold1,]
test <- employees_z2[folds$Fold2,]

##run knn with everything
preds.knn <- knn(train[, names(train) != "MonthlyIncome"], test[, names(train) != "MonthlyIncome"], cl=train$MonthlyIncome, k=10)
preds.knn <- as.numeric(preds.knn)
plot(x=preds.knn, y=test$MonthlyIncome)

#gross
print("RMSE:")
## [1] "RMSE:"
RMSE(preds.knn, test$MonthlyIncome)
## [1] 7701.547
##run knn with just a few manually selected variables
preds.knn <- knn(train[, c("zJobRole", "zJobLevel", "zTotalWorkingYears")], test[, c("zJobRole", "zJobLevel", "zTotalWorkingYears")], cl=train$MonthlyIncome, k=10)
preds.knn <- as.numeric(preds.knn)
plot(x=preds.knn, y=test$MonthlyIncome)

print("RMSE:")
## [1] "RMSE:"
RMSE(preds.knn, test$MonthlyIncome)
## [1] 7675.329
#not bad

##run knn with even fewer manually selected variables
preds.knn <- knn(train[, c("zJobLevel", "zTotalWorkingYears")], test[, c("zJobLevel", "zTotalWorkingYears")], cl=train$MonthlyIncome, k=10)
preds.knn <- as.numeric(preds.knn)
plot(x=preds.knn, y=test$MonthlyIncome)

print("RMSE:")
## [1] "RMSE:"
RMSE(preds.knn, test$MonthlyIncome)
## [1] 7677.189
#not bad

##run knn with PCs
preds.knn <- knn(train[, c("zJobRole", "zJobLevel", "PC1", "PC2", "PC3")], test[, c("zJobRole", "zJobLevel", "PC1", "PC2", "PC3")], cl=train$MonthlyIncome, k=10)
preds.knn <- as.numeric(preds.knn)
plot(x=preds.knn, y=test$MonthlyIncome)

print("RMSE:")
## [1] "RMSE:"
RMSE(preds.knn, test$MonthlyIncome)
## [1] 7685.852